In this project, I intend to replicate a study by Grilli et al. (2015) titled Exploiting TIMSS and PIRLS combined data: multivariate multilevel modelling of student achievement. This study looked into student achievement in Italy using a multivariate multilevel modeling. I intend to replicate the study on my home country, Yemen. However, there is a limitation to my replication. Yemen only took the Trends in International Mathematics and Science Study (TIMSS) test but not PIRLS. Accordingly, the analyses I will perform will only be on TIMSS. Additionally, the last time Yemen took the TIMSS test was in 2011 and that is the data I plan to use in this analysis. As shown below, there are 144 schools that participated in the study, but not all have appropriate sample size. some schools have less than 30 students that took the test. These schools will be dropped automatically.
Click here for the original study.
Depedent Variables: Student achievement scores on math and science.
Independent Variables: Studnet demographical information, such as gender, family background, wealth of surrounding area.
Group Identifier: Schools
Below is an initial analysis of what the data looks like.
#rm(list=ls())
list.of.packages <- c("foreign", "lme4", "sjstats", "ggplot2", "lattice", "rockchalk", "stargazer")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only=TRUE)
## Loading required package: foreign
## Loading required package: lme4
## Loading required package: Matrix
## Loading required package: sjstats
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.11
## Current Matrix version is 1.2.12
## Please re-install 'TMB' from source or restore original 'Matrix' package
## Loading required package: ggplot2
## Loading required package: lattice
## Loading required package: rockchalk
## Loading required package: stargazer
##
## Please cite as:
## Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2. http://CRAN.R-project.org/package=stargazer
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
YemenData <- read.spss("C:\\Users\\Abdulrazzag\\Desktop\\TIMSS Yemen\\Merged Data\\Yemen2007.sav", to.data.frame = TRUE)
## re-encoding from UTF-8
YemenData$IDSCHOOL <- as.factor(YemenData$IDSCHOOL)
#Calculate the number of students in every school
freq.school <- as.data.frame(table(unique(YemenData)$IDSCHOOL))
#Number of schools
length(levels(YemenData$IDSCHOOL))
## [1] 144
#Exclude schools with less than 30 students
schools <- freq.school[which(freq.school$Freq>30 & freq.school$Freq!=0),]
#Exclude the schools with less than 30 students from the dataframe
YemenData.filtered <- subset(YemenData,IDSCHOOL %in% schools$Var1)
#Check if the code worked by comparing the two vectors of school IDs
identical(unique(schools$Var1), unique(YemenData.filtered$IDSCHOOL))
## [1] TRUE
colnames(YemenData.filtered)[762] <- "ECONOMIC_DISADVANTAGE"
colnames(YemenData.filtered)[763] <- "ECONOMIC_AFFLUENCY"
cat("Original number of schools is:", length(unique(YemenData$IDSCHOOL)))
## Original number of schools is: 144
cat("\n", "New number of schools is:", length(unique(YemenData.filtered$IDSCHOOL)))
##
## New number of schools is: 121
# final.data <- YemenData.filtered[,c("IDSCHOOL", "IDSTUD", "IDTEACH", "ASMMAT01", "ASSSCI01", "AC4GSBED")]
# colnames(final.data) <- paste0(c("IDSCHOOL", "IDSTUD", "IDTEACH", "ASMMAT01", "ASSSCI01"))
# YemenData.Reshaped <- reshape(final.data, varying = list(c("ASMMAT01", "ASSSCI01")), timevar = "IDSCHOOL", idvar = c("IDSTUD", "IDTEACH"), direction = 'long', sep="")
# colnames(YemenData.Reshaped) <- paste0(c("IDSCHOOL", "IDSTUD", "IDTEACH", "AC4GSBED", "ASMMAT01"))
#
#head(YemenData.filtered)
ggplot(YemenData[as.numeric(YemenData$IDSCHOOL)<=15,], aes(x=ASSSCI01,y=ASMMAT03, color=IDSCHOOL)) +
geom_boxplot() +
facet_wrap(~IDCLASS)+
ylab("Science Achievement")
ggplot(YemenData[as.numeric(YemenData$IDSCHOOL)<=15,], aes(x=ASMMAT01,y=ASMMAT03, color=IDSCHOOL)) +
geom_point() +
facet_wrap(~IDCLASS)+
ylab("Math Achievement")
prop.table(table(YemenData.filtered$ITSEX))
##
## GIRL BOY
## 0.4531473 0.5468527
cat("\n")
cat("ECONOMIC_AFFLUENCY")
## ECONOMIC_AFFLUENCY
cat("\n")
prop.table(table(YemenData.filtered$ECONOMIC_AFFLUENCY))
##
## 0 TO 10 PERCENT 11 TO 25 PERCENT 26 TO 50 PERCENT
## 0.45638391 0.31099694 0.21917359
## MORE THAN 50 PERCENT
## 0.01344556
cat("\n")
cat("ECONOMIC_DISADVANTAGE")
## ECONOMIC_DISADVANTAGE
cat("\n")
prop.table(table(YemenData.filtered$ECONOMIC_DISADVANTAGE))
##
## 0 TO 10 PERCENT 11 TO 25 PERCENT 26 TO 50 PERCENT
## 0.0330179 0.1184941 0.2189879
## MORE THAN 50 PERCENT
## 0.6295001
model0 <- lmer(ASMMAT01 ~ (1|IDSCHOOL), data = YemenData.filtered)
model1 <- lmer( ASMMAT01~ ECONOMIC_DISADVANTAGE + ECONOMIC_AFFLUENCY + ITSEX + (1|IDSCHOOL), data = YemenData.filtered)
model2 <- lmer( ASSSCI01~ ECONOMIC_DISADVANTAGE + ECONOMIC_AFFLUENCY + ITSEX + (1|IDSCHOOL), data = YemenData.filtered)
msss <- ranef(model1, condVar=TRUE)
dotplot(msss)
## $IDSCHOOL
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: ASMMAT01 ~ ECONOMIC_DISADVANTAGE + ECONOMIC_AFFLUENCY + ITSEX +
## (1 | IDSCHOOL)
## Data: YemenData.filtered
##
## REML criterion at convergence: 106941.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0136 -0.7043 -0.0359 0.6779 3.3455
##
## Random effects:
## Groups Name Variance Std.Dev.
## IDSCHOOL (Intercept) 4223 64.99
## Residual 7612 87.25
## Number of obs: 9052, groups: IDSCHOOL, 111
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 248.7188 43.0149 5.782
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -34.5896 42.5606 -0.813
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -46.6411 43.4869 -1.073
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -36.6844 42.3566 -0.866
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT 25.9220 15.3248 1.692
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT -0.2786 19.5649 -0.014
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT 26.8906 49.5883 0.542
## ITSEXBOY 4.0164 2.9273 1.372
##
## Correlation of Fixed Effects:
## (Intr) ECONOMIC_DT2P ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DT2P -0.867
## ECONOMIC_DISADVANTAGE2T5P -0.905 0.841
## ECONOMIC_DISADVANTAGEMT5P -0.974 0.873 0.916
## ECONOMIC_AT2P -0.243 0.076 -0.028
## ECONOMIC_AFFLUENCY2T5P -0.300 -0.012 0.120
## ECONOMIC_AFFLUENCYMT5P -0.618 0.466 0.544
## ITSEXBOY -0.050 0.017 0.006
## ECONOMIC_DISADVANTAGEMT5P ECONOMIC_AT2P
## ECONOMIC_DT2P
## ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DISADVANTAGEMT5P
## ECONOMIC_AT2P 0.121
## ECONOMIC_AFFLUENCY2T5P 0.217 0.385
## ECONOMIC_AFFLUENCYMT5P 0.595 0.188
## ITSEXBOY 0.008 0.040
## ECONOMIC_AFFLUENCY2T5P ECONOMIC_AFFLUENCYMT5P
## ECONOMIC_DT2P
## ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DISADVANTAGEMT5P
## ECONOMIC_AT2P
## ECONOMIC_AFFLUENCY2T5P
## ECONOMIC_AFFLUENCYMT5P 0.263
## ITSEXBOY 0.005 0.018
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: ASSSCI01 ~ ECONOMIC_DISADVANTAGE + ECONOMIC_AFFLUENCY + ITSEX +
## (1 | IDSCHOOL)
## Data: YemenData.filtered
##
## REML criterion at convergence: 109921.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8341 -0.7393 -0.0918 0.6591 3.8430
##
## Random effects:
## Groups Name Variance Std.Dev.
## IDSCHOOL (Intercept) 6212 78.82
## Residual 10576 102.84
## Number of obs: 9052, groups: IDSCHOOL, 111
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 263.242 52.128 5.050
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -72.402 51.578 -1.404
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -82.697 52.704 -1.569
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -79.246 51.334 -1.544
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT 31.794 18.572 1.712
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT 4.401 23.713 0.186
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT 11.014 60.082 0.183
## ITSEXBOY 4.086 3.453 1.183
##
## Correlation of Fixed Effects:
## (Intr) ECONOMIC_DT2P ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DT2P -0.867
## ECONOMIC_DISADVANTAGE2T5P -0.905 0.841
## ECONOMIC_DISADVANTAGEMT5P -0.974 0.873 0.916
## ECONOMIC_AT2P -0.243 0.076 -0.028
## ECONOMIC_AFFLUENCY2T5P -0.300 -0.012 0.120
## ECONOMIC_AFFLUENCYMT5P -0.618 0.466 0.544
## ITSEXBOY -0.049 0.016 0.006
## ECONOMIC_DISADVANTAGEMT5P ECONOMIC_AT2P
## ECONOMIC_DT2P
## ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DISADVANTAGEMT5P
## ECONOMIC_AT2P 0.121
## ECONOMIC_AFFLUENCY2T5P 0.217 0.385
## ECONOMIC_AFFLUENCYMT5P 0.595 0.188
## ITSEXBOY 0.008 0.039
## ECONOMIC_AFFLUENCY2T5P ECONOMIC_AFFLUENCYMT5P
## ECONOMIC_DT2P
## ECONOMIC_DISADVANTAGE2T5P
## ECONOMIC_DISADVANTAGEMT5P
## ECONOMIC_AT2P
## ECONOMIC_AFFLUENCY2T5P
## ECONOMIC_AFFLUENCYMT5P 0.263
## ITSEXBOY 0.005 0.018
icc(model0)
## Linear mixed model
## Family: gaussian (identity)
## Formula: ASMMAT01 ~ (1 | IDSCHOOL)
##
## ICC (IDSCHOOL): 0.356449
cat("\n")
levels(YemenData.filtered$ECONOMIC_DISADVAntage)
## NULL
levels(YemenData.filtered$ECONOMIC_AFFLUENCY)
## [1] "0 TO 10 PERCENT" "11 TO 25 PERCENT" "26 TO 50 PERCENT"
## [4] "MORE THAN 50 PERCENT"
cat("Math Achievement model")
## Math Achievement model
stargazer(model1,type ="text", p.auto = TRUE)
##
## =====================================================================
## Dependent variable:
## ---------------------------
## ASMMAT01
## ---------------------------------------------------------------------
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -34.590
## (42.561)
##
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -46.641
## (43.487)
##
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -36.684
## (42.357)
##
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT 25.922*
## (15.325)
##
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT -0.279
## (19.565)
##
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT 26.891
## (49.588)
##
## ITSEXBOY 4.016
## (2.927)
##
## Constant 248.719***
## (43.015)
##
## ---------------------------------------------------------------------
## Observations 9,052
## Log Likelihood -53,470.760
## Akaike Inf. Crit. 106,961.500
## Bayesian Inf. Crit. 107,032.600
## =====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
cat("Science Achievement model")
## Science Achievement model
stargazer(model2,type ="text", p.auto = TRUE)
##
## =====================================================================
## Dependent variable:
## ---------------------------
## ASSSCI01
## ---------------------------------------------------------------------
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -72.402
## (51.578)
##
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -82.697
## (52.704)
##
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -79.246
## (51.334)
##
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT 31.794*
## (18.572)
##
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT 4.401
## (23.713)
##
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT 11.014
## (60.082)
##
## ITSEXBOY 4.086
## (3.453)
##
## Constant 263.242***
## (52.128)
##
## ---------------------------------------------------------------------
## Observations 9,052
## Log Likelihood -54,960.560
## Akaike Inf. Crit. 109,941.100
## Bayesian Inf. Crit. 110,012.200
## =====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#outreg(list(model1), type = "html")
As we saw, the intraclass correlation coefficient is pretty high indicating multilevel data.
confint(model1)
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 55.094148 72.388656
## .sigma 85.981752 88.539791
## (Intercept) 166.455215 331.029228
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -116.010641 46.823714
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -129.812737 36.559782
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -117.708517 44.340362
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT -3.411267 55.221250
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT -37.702115 37.146680
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT -67.986709 121.758162
## ITSEXBOY -1.839272 9.659041
confint(model2)
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 66.852457 87.77710
## .sigma 101.346272 104.36138
## (Intercept) 163.533366 362.98430
## ECONOMIC_DISADVANTAGE11 TO 25 PERCENT -171.075412 26.27024
## ECONOMIC_DISADVANTAGE26 TO 50 PERCENT -183.495093 18.15101
## ECONOMIC_DISADVANTAGEMORE THAN 50 PERCENT -177.442952 18.96087
## ECONOMIC_AFFLUENCY11 TO 25 PERCENT -3.757090 67.30213
## ECONOMIC_AFFLUENCY26 TO 50 PERCENT -40.957227 49.76655
## ECONOMIC_AFFLUENCYMORE THAN 50 PERCENT -103.941948 125.96792
## ITSEXBOY -2.789179 10.76092